Assembling candidate taxa

Here is the script used to run the whole batch:

#!/bin/bash
# SGE array job: one task per line of combined_genuses.txt (109 genera),
# with at most 15 tasks running concurrently.
#$ -t 1-109
#$ -tc 15
#$ -cwd
#$ -j yes
#$ -N comb_genus
#$ -pe mpi 4
#$ -l h_vmem=20G
set -e
counter=1

cd /mnt/shared/scratch/synology/nw42839/2019-11-04-focusdb/
# Walk the genus list; each array task executes focusDB for exactly the
# one line whose position matches its $SGE_TASK_ID.
while read genus species
do
    if [ "$counter" -eq "$SGE_TASK_ID" ]
    then
        # NOTE(review): 'conda activate' normally needs conda.sh sourced
        # first in non-interactive shells -- confirm this works under SGE.
        conda activate 16db
        focusDB  -o ${genus}_genus --organism_name "${genus}" --njobs 6 --threads 2 --cores 6 --memory 20 --n_references 200 --n_SRAs 50 --maxdist .1 --focusDB_data /mnt/shared/scratch/nw42839/.focusDB/ -v 1 --timeout 1500 --fastqtool fastq-dump --process_partial --use_available --sge --sge_env 16db
    fi
    counter=$((counter + 1))
done < ~/GitHub/focusdb_manuscript/docs/datasets/combined_genuses.txt

Afterwards, the results were aggregated as follows:

# Collect per-genus outputs into a single results directory for archiving.
# Fix: the created directory must match the destination used by every cp/cat
# below (it was previously created as "2019-11-11-focusdb").
mkdir 2019-11-11-results
while read genus; do echo $genus; cp ./${genus}_genus/${genus}_ribo16s.fasta ./2019-11-11-results/ ; cp ./${genus}_genus/focusDB.log ./2019-11-11-results/${genus}.log ; cp ./${genus}_genus/sequence_summary.tab  ./2019-11-11-results/${genus}_seq_summary.tab; done < ~/GitHub/focusdb_manuscript/docs/datasets/combined_genuses.txt
cp --parents ./*_genus/*/results/riboSeed/seed/final_de_novo_assembly/contigs.fasta ./2019-11-11-results/
cat ./*_genus/SUMMARY > 2019-11-11-results/summary_all
cat ./2019-11-11-results/*seq_summary.tab > 2019-11-11-results/sequence_summary_all.tab
tar czf 2019-11-11-results.tar.gz 2019-11-11-results/

Suppl Figure 1: Number of SRAs available for each species

A python script sralist.py was used to generate a list of the number of SRAs for a given species, based on parsing the results from sraFind.

# Count available WGS SRAs per species for each community list.
# NOTE(review): the Extremes run reads ~/.focusDB/sraFind.tab while the
# other three read ../16db/sraFind-All-biosample-with-SRA-hits.txt --
# confirm these are the same sraFind snapshot.
python scripts/sralist.py -s  ~/.focusDB/sraFind.tab -l ./docs/datasets/extreme_species_list.txt -o results/Extremes_sra_count
python scripts/sralist.py -s ../16db/sraFind-All-biosample-with-SRA-hits.txt -l ./docs/datasets/balanced_species_list.txt -o results/Balanced_sra_count
python scripts/sralist.py -s ../16db/sraFind-All-biosample-with-SRA-hits.txt -l ./docs/datasets/HMP_species_list.txt -o results/HMP_sra_count
python scripts/sralist.py -s ../16db/sraFind-All-biosample-with-SRA-hits.txt -l ./docs/microbiome_data/endo_species.txt -o results/endo_sra_count
# Build the SRA-count table for the three mock communities.  Each
# results/<dataset>_sra_count/sralist.txt holds "organism:count" lines.
# (Replaces the previous idiom of seeding an all-NA data frame and
# growing it with rbind() inside the loop.)
srac <- do.call(rbind, lapply(c("Extremes", "HMP", "Balanced"), function(d) {
  thispath <- file.path("results", paste0(d, "_sra_count"), "sralist.txt")
  tmp <- read.csv2(thispath, sep = ":", col.names = c("org", "count"),
                   stringsAsFactors = FALSE, header = FALSE)
  tmp$mock <- d
  tmp
}))

# Derive taxonomy columns from the "Genus species ..." organism label.
srac$genus <- gsub("(.*) .*", "\\1", srac$org)
srac$species <- gsub("(.*) (.*)", "\\2", srac$org)
srac$binom <- paste(srac$genus, srac$species)
# totals used in the figure subtitle below
nunique_strains <- length(unique(srac[, "org"]))
nwith_sras <- length(unique(srac[srac$count > 0, "org"]))
# Supplementary Figure 1: per-strain WGS SRA availability, faceted by
# mock community (endo is handled separately below).
(psrastrains <- ggplot(srac %>% filter(mock != "endo") %>% filter(count > 0), aes(reorder(org,count), y=count, fill=genus, color=genus)) + 
    geom_bar(stat="identity") +
    coord_flip() + 
    # guide = "none" replaces the deprecated guide = F
    scale_color_discrete(guide="none") +
    scale_fill_discrete(guide="none") +
    facet_wrap(~mock, scales = "free") +
    labs(
      title="Distribution of Data Availability for 3 Mock Communities", 
      # typo fixed: "availabe" -> "available"
      subtitle=paste0(nrow(srac %>% filter(count > 0)), " of ", nunique_strains, " organisms have available SRA data "),
      fill="Genus",
     color="Genus",
     y= "Number of WGS SRAs", x="Strain") + 
    mytheme +
    theme(legend.position = c(.8, .3), 
          text = element_text(size=8))
)

ggsave("docs/figures/S1-sras.png", psrastrains, device = "png", dpi = 300, width = 8, height = 5)
## add endo data
# Load the endo community counts and give them the same columns as srac
# (genus / species / binom / mock) so the two tables can be stacked.
endo_path <- file.path("results", "endo_sra_count", "sralist.txt")
endo_counts <- read.csv2(endo_path, sep = ":", header = FALSE,
                         col.names = c("org", "count"),
                         stringsAsFactors = FALSE)
endo_counts$genus <- gsub("(.*) .*", "\\1", endo_counts$org)
endo_counts$species <- gsub("(.*) (.*)", "\\2", endo_counts$org)
endo_counts$binom <- paste(endo_counts$genus, endo_counts$species)
endo_counts$mock <- "endo"
srac <- rbind(srac, endo_counts)

Dataset overlap

# One row per (genus, species, source-file) across the three dataset
# lists plus the endobiota list; `dataset` records which file each
# binomial came from.
dataset_files <- c(Sys.glob("./docs/datasets/*species*"),
                   "./docs/microbiome_data/endo_species.txt")
species_names <- data.frame(genus = character(), species = character(),
                            dataset = character(), stringsAsFactors = F)
for (f in dataset_files) {
  this_list <- read.csv(f, sep = " ", stringsAsFactors = F, header = F,
                        col.names = c("genus", "species"))
  species_names <- rbind(species_names, cbind(this_list, dataset = f))
  print(f)
}
## [1] "./docs/datasets/HMP_species_list.txt"
## [1] "./docs/datasets/balanced_species_list.txt"
## [1] "./docs/datasets/extreme_species_list.txt"
## [1] "./docs/microbiome_data/endo_species.txt"
# Genera seen in any mock community, and the subset with at least one SRA.
unique_genuses <- unique(srac[, "genus"])
unique_genuses_with_sras <- unique(srac[srac$count > 0, "genus"])

# NCBI prokaryotes assembly summary; used below to decide which genera
# have a closed reference (Status of Chromosome / Complete Genome).
# NOTE(review): read.csv2 implies dec="," -- harmless here since only the
# organism-name and Status columns are used, but confirm if numeric
# columns are ever read from this table.
proks <- read.csv2("~/.focusDB/prokaryotes.txt", sep="\t", header = T, stringsAsFactors = F)
# first whitespace-delimited token of the organism name = genus
proks$genus <- gsub("(.*?) .*", "\\1", proks$X.Organism.Name)
# For every genus in any dataset list, flag whether a closed reference
# exists (ref) and whether any SRAs are available (sras), then label why
# a genus would be excluded from the batch run.
# (Idiom fix: T/F replaced with TRUE/FALSE, which cannot be reassigned.)
theseproks <- left_join(
  species_names, 
  proks %>% 
    filter(Status %in% c("Chromosome", "Complete Genome")) %>% 
    mutate(ref=TRUE) %>%
    select(genus, ref) %>% distinct()
)  %>%
  # genera missing from proks get ref = FALSE rather than NA
  transform(ref=ifelse(is.na(ref), FALSE, ref)) %>%
  select(-species) %>%
  distinct() %>%
  mutate(sras = genus %in% unique_genuses_with_sras) %>%
  # map the source filename to a display name
  transform(dataset = case_when(
    grepl("balanced", dataset) ~ "Balanced",
    grepl("HMP", dataset) ~ "HMP",
    grepl("extreme", dataset) ~ "Extremes",
    grepl("endo", dataset) ~ "Endobiota"
  )) %>% 
  # first matching condition wins
  mutate(exclusion=case_when(
    !sras & !ref ~ "No SRAs or Reference",
    !sras  ~ "No SRAs",
    !ref ~ "No Reference",
    sras & ref ~ "Eligible"
  )
  )
## Joining, by = "genus"
# UpSet-style plot of dataset membership per genus, colored by exclusion
# reason; scale_x_upset() comes from the ggupset package.
(p_dataset_overlap <-  theseproks %>%   group_by(genus, exclusion) %>%
    # list-column of datasets per genus: the combination axis for upset
    summarize(ds = list(dataset)) %>% 
    ggplot(aes(x = ds, fill=exclusion)) +
    geom_bar(position=position_stack(reverse = TRUE)) + 
    labs(title="Genus-level overlap of datasets", 
         x="Dataset", y="Count", fill="") +
    # palette indices reversed to flip the color order
    scale_fill_manual(values=wesanderson::wes_palette("IsleofDogs2")[4:1]) +
    mytheme +
    scale_x_upset() +
    theme(
      legend.position=c(.6,.5),
      legend.background = element_blank(),
      legend.direction='vertical') +
    guides(fill = guide_legend(reverse = TRUE))
)

#species_names %>% ggplot(aes(x=dataset, y=genus)) + geom_tile()
# Genera eligible for the batch run: those with both a closed reference
# AND available SRAs; the sorted list feeds the SGE array job above.
unique_and_valid <- theseproks %>%
  select(genus, ref, sras) %>%
  distinct() %>%
  filter(sras, ref)
write.table(sort(unique_and_valid$genus), col.names = F, row.names = F, file = "./docs/datasets/combined_genuses.txt", quote=F)

ggsave("docs/figures/S6-datasets.png", p_dataset_overlap, device = "png", dpi = 300, width = 5, height = 4)

Identifying poor taxonomic assignments

# Per-sequence taxonomy summary: one row per recovered 16S sequence,
# with the classifier's percentage string in `perc`.
tax_summary <- read.csv(
  file.path(results_dir, "sequence_summary_all.tab"),
  col.names = c("id", "path","source", "start", "end", "taxonomy", "perc", "nseqs", "shortname"), 
  header=FALSE, sep="\t", stringsAsFactors = FALSE)
# top-level classification percentage: the first ";"-delimited field
tax_summary$top_level_perc <- as.numeric(gsub("(.*?);.*", "\\1", tax_summary$perc))
# SRA accession = id up to the first underscore
tax_summary$sra <- gsub("(.*?)_.*", "\\1", tax_summary$id)


# Anything classified below 70% at the top level is treated as
# questionable; the offending SRA accessions are written out so the
# fasta-filtering step below can exclude them.
questionable_annotations <- tax_summary[tax_summary$top_level_perc < 70, ]
write.table(unique(questionable_annotations$sra), file = "bad_sras.txt", row.names = FALSE, col.names = FALSE, quote=FALSE)

Figure 2: sra successes

# Per-SRA outcome table from focusDB's SUMMARY files.  `state` holds an
# SRA accession for per-SRA rows, or "global" for the per-genus row.
summary_data <- read.csv(file.path(results_dir, "summary_all"),
                    sep="\t", header=F,
                    col.names = c("org", "status", "state", "messag"), 
                    stringsAsFactors = F)
# first, we drop anything with questionable annotations
table(summary_data$state %in% questionable_annotations$sra)
## 
## FALSE  TRUE 
##  2518    37
summary_data <- summary_data[!summary_data$state %in% questionable_annotations$sra, ]

# here we get rid of those without references by matching up with unique_and_valid
summary_data <- summary_data %>% filter(org %in% unique_and_valid$genus)
# split out the per-genus "global" rows; an empty message means success
globals <- summary_data %>% filter(state=="global") %>%
  select(org, messag) %>% dplyr::rename(globalmessage=messag)
summary_data <- summary_data %>% filter(state!="global")
# outer parentheses make the assignment print its value
(globals$globalmessage <- ifelse(globals$globalmessage == "", "Success", globals$globalmessage))
##  [1] "Success"                                   
##  [2] "Success"                                   
##  [3] "Success"                                   
##  [4] "Success"                                   
##  [5] "Success"                                   
##  [6] "No 16s sequences detected in re-assemblies"
##  [7] "No 16s sequences detected in re-assemblies"
##  [8] "No 16s sequences detected in re-assemblies"
##  [9] "Success"                                   
## [10] "Success"                                   
## [11] "Success"                                   
## [12] "Success"                                   
## [13] "Success"                                   
## [14] "Success"                                   
## [15] "No 16s sequences detected in re-assemblies"
## [16] "Success"                                   
## [17] "Success"                                   
## [18] "Success"                                   
## [19] "No 16s sequences detected in re-assemblies"
## [20] "Success"                                   
## [21] "Success"                                   
## [22] "Success"                                   
## [23] "No 16s sequences detected in re-assemblies"
## [24] "Success"                                   
## [25] "No 16s sequences detected in re-assemblies"
## [26] "Success"                                   
## [27] "Success"                                   
## [28] "Success"                                   
## [29] "Success"                                   
## [30] "Success"                                   
## [31] "Success"                                   
## [32] "Success"                                   
## [33] "Success"                                   
## [34] "Success"                                   
## [35] "Success"                                   
## [36] "Success"                                   
## [37] "Success"                                   
## [38] "Success"                                   
## [39] "Success"                                   
## [40] "Success"                                   
## [41] "Success"                                   
## [42] "Success"                                   
## [43] "Success"                                   
## [44] "No 16s sequences detected in re-assemblies"
## [45] "Success"                                   
## [46] "Success"                                   
## [47] "No 16s sequences detected in re-assemblies"
## [48] "Success"                                   
## [49] "No 16s sequences detected in re-assemblies"
## [50] "Success"                                   
## [51] "No 16s sequences detected in re-assemblies"
## [52] "Success"                                   
## [53] "Success"                                   
## [54] "Success"                                   
## [55] "Success"                                   
## [56] "Success"                                   
## [57] "Success"                                   
## [58] "Success"                                   
## [59] "Success"                                   
## [60] "Success"                                   
## [61] "Success"                                   
## [62] "Success"                                   
## [63] "Success"                                   
## [64] "Success"                                   
## [65] "Success"                                   
## [66] "Success"                                   
## [67] "Success"                                   
## [68] "Success"                                   
## [69] "Success"                                   
## [70] "Success"                                   
## [71] "No 16s sequences detected in re-assemblies"
## [72] "Success"                                   
## [73] "Success"                                   
## [74] "Success"                                   
## [75] "No 16s sequences detected in re-assemblies"
## [76] "Success"                                   
## [77] "Success"                                   
## [78] "Success"                                   
## [79] "Success"                                   
## [80] "No 16s sequences detected in re-assemblies"
## [81] "Success"                                   
## [82] "Success"                                   
## [83] "Success"                                   
## [84] "Success"                                   
## [85] "No 16s sequences detected in re-assemblies"
# tidy things up;  this clarifies the image a bit (treat "No 16s sequences
# detected in re-assemblies" the same as a success for plotting)
(globals$globalmessage <- gsub("No 16s sequences detected in re-assemblies", "Success", globals$globalmessage))
##  [1] "Success" "Success" "Success" "Success" "Success" "Success" "Success"
##  [8] "Success" "Success" "Success" "Success" "Success" "Success" "Success"
## [15] "Success" "Success" "Success" "Success" "Success" "Success" "Success"
## [22] "Success" "Success" "Success" "Success" "Success" "Success" "Success"
## [29] "Success" "Success" "Success" "Success" "Success" "Success" "Success"
## [36] "Success" "Success" "Success" "Success" "Success" "Success" "Success"
## [43] "Success" "Success" "Success" "Success" "Success" "Success" "Success"
## [50] "Success" "Success" "Success" "Success" "Success" "Success" "Success"
## [57] "Success" "Success" "Success" "Success" "Success" "Success" "Success"
## [64] "Success" "Success" "Success" "Success" "Success" "Success" "Success"
## [71] "Success" "Success" "Success" "Success" "Success" "Success" "Success"
## [78] "Success" "Success" "Success" "Success" "Success" "Success" "Success"
## [85] "Success"
# (globals$globalmessage <- gsub("No available.*", "No available references", globals$globalmessage))

# Attach the per-genus global outcome to each per-SRA record, then
# collapse the many raw focusDB messages into a few plotting categories.
p_summary_data_pre <- full_join(
  summary_data , globals, by=c("org")) %>% 
  select(org, globalmessage, messag, state ) %>%  distinct()

p_summary_data <- right_join(srac,  p_summary_data_pre, by=c("org")) %>% 
  mutate(
    strain=org, 
    # first matching condition wins; NA conditions are treated as FALSE
    pmessage = case_when(
      startsWith(messag, "Unknown error selecting reference") ~ "Error",
      is.na(messag) ~ "QC Fail",
      startsWith(messag, "Library error") ~ "QC Fail",
      startsWith(messag, "Reads under") ~ "QC Fail",
      startsWith(messag, "Reads over") ~ "QC Fail",
      startsWith(messag, "No reference meets threshold") ~ "No close reference",
      # NOTE: a 'startsWith(messag, "Reads under the")' branch used to sit
      # here but was unreachable -- the "Reads under" prefix above already
      # matches every such message; removing it changes nothing.
      startsWith(messag, "Insufficient coverage") ~ "QC Fail", 
      startsWith(messag, "Unknown crit") ~ "Error",
      startsWith(messag, "Error downloading") ~ "Error",
      messag=="" ~ "16S Recovered", 
      messag=="riboSeed unsuccessful" ~ "No 16S Recovered")) 

# #  plot mock also in shared methods
# for (grp in  c("Extremes",  "Balanced", "Endobiota")){
#   thisgroup <- quo(grp)
#   (mock_plots[[grp]] <- plot_mock(df=p_summary_data%>% filter(mock == (!!thisgroup))))
#   ggsave(paste0("docs/figures/S2", grp, ".png"), mock_plots[[grp]], device = "png", dpi = 300, width = 12, height = 7)
# }
# Figure 2 (per-genus outcomes); plot_genera is presumably defined in
# shared.R -- TODO confirm.
(mock_plots<- plot_genera(df=p_summary_data))

# NOTE(review): plot_genera() is invoked a second time here; mock_plots
# could be reused instead of recomputing the plot.
ggsave(paste0("docs/figures/2-genera.png"), plot_genera(df=p_summary_data), device = "png", dpi = 300, width = 10,  height = 13)
# ###################
# alpha_data <- p_summary_data %>%
#   group_by(strain) %>% 
#   mutate(alpha = ifelse(globalmessage=="Success", "gray10", "gray70")) %>% 
#   select(strain, alpha) %>% distinct() %>% as.data.frame()
# p_summary_data_B <- p_summary_data  %>%  arrange(pmessage) %>%
#     group_by(strain) %>%
#     mutate(thisid = row_number()) %>%
#     transform(thisid = ifelse(is.na(messag), 0, thisid)) 
# p_summary_data_B$strain <- factor(p_summary_data_B$strain, levels=rev(levels(factor(p_summary_data_B$strain))))
# 
# alpha_v <- alpha_data[order(alpha_data$strain, alpha_data$alpha), "alpha"]
# (p_outcomesB <- ggplot(
#   p_summary_data_B, 
#   aes(x=strain, y=thisid, shape=pmessage,
#       color=pmessage, fill=pmessage)) + 
#     scale_shape_manual(guide = "legend", values = c(21, 22, 23,24)) + 
#     geom_point(size=5)  + coord_flip() + mytheme +
#     scale_colour_brewer(palette = "Set2") +
#     scale_fill_brewer(palette = "Set2") +
#     # unfortuantely facet wrapping throws off our beautiful alpha'd out names
#     # due to those repeated
#     #facet_wrap(~mock, nrow = 2, scales = "free") +
#     labs("SRAs Proccessed by focusDB",
#          y="Number of whole-genome sequencing SRAs", x="", color="Per SRA", fill="Per SRA", shape="Per SRA")  +
#     theme(axis.text=element_text(size=13),
#           axis.title.x =element_text(size=14),
#           axis.text.y = element_text(colour = alpha_v))
# )  
# ggsave("docs/figures/2-success.png", p_outcomesB, device = "png", dpi = 300, width = 12, height = 7)

Assessing uniqueness:

First, for simplicity, we combine all the sequences for all the runs. We have the logs to relate the sequences to why they were selected, but as we are talking about improving the database as a whole, combining them all simplifies aspects of interpretation.

Note that first we remove any with dodgy taxonomic annotations

# Pool all per-run 16S fastas (full and fast modes) before filtering out
# questionable annotations in the R step below.
cat ./results/*/*full_ribo16s.fasta  > ./results/pre_full_focusDB_ribo16s.fasta
cat ./results/*/*fast_ribo16s.fasta  > ./results/pre_fast_focusDB_ribo16s.fasta
# Drop any fasta record whose header mentions an SRA accession with a
# questionable taxonomic assignment.  Records are assumed to be exactly
# two lines (header + sequence), matching the original skip-2 logic --
# TODO confirm the fastas are never wrapped.
# (Rewritten to precompute the match mask and a keep vector instead of
# growing `newlines` with c() inside the loop, which was O(n^2).)
bad_sras <- unique(questionable_annotations$sra)
for (f in c("full", "fast")){
  lines <- readLines(paste0("./results/pre_", f,"_focusDB_ribo16s.fasta"))
  # TRUE for any line mentioning a questionable accession
  is_bad <- rep(FALSE, length(lines))
  for (sra in bad_sras){
    is_bad <- is_bad | grepl(sra, lines)
  }
  keep <- rep(TRUE, length(lines))
  skip <- 0
  for (i in seq_along(lines)){
    if (is_bad[i]){
      # drop this line and the one after it (header + sequence)
      skip <- 2
    }
    if (skip > 0){
      skip <- skip - 1
      keep[i] <- FALSE
    }
  }
  writeLines(lines[keep], paste0("./results/", f,"_focusDB_ribo16s.fasta"))
}

Assessing SILVA’s composition

if [ -f "~/Downloads/SILVA_132_SSURef_tax_silva.fasta.gz" ] ; then echo "already downloaded" ; else wget    https://www.arb-silva.de/fileadmin/silva_databases/release_132/Exports/SILVA_132_SSURef_tax_silva.fasta.gz ; fi

First, we read in the SILVA db and parse the headers for later comparison. One thing we do is get the codes for what the NCBI accession prefixes mean, and use that to determine the source of the sequences.

# Parse a set of fasta headers (SILVA-style) into per-sequence metadata.
#
# Headers are assumed to look like
#   "<accession>.<start>.<stop> <taxonomy>;<organism>"
# -- TODO confirm the focusDB / de-novo headers, which pass through the
# same parser, follow this layout.
#
# @param xset an XStringSet whose names() are the fasta headers
# @return a data frame with one row per sequence: accession prefix/id,
#   organism/genus/species, alignment start/stop, source flags
#   (wgs/amplicon/cg/patent/htgs), a "category" label, and a "matcher"
#   key used for joining back to assemblies
process_db <- function(xset){
  # codes_from_ncbi read in shared.R
  headers <- data.frame("raw"=names(xset), stringsAsFactors = F)
  # accession prefix: the leading run of non-digit characters
  headers$pre <- gsub("(\\D*).*", "\\1", headers$raw)
  # accession id: everything before the first "."
  headers$id <- gsub("(.*?)\\.(.*)", "\\1", headers$raw)
  # organism/strain: the text after the last ";" in the header
  headers$strain <- gsub("(.*?)\\.(.*?)\\.(.*)\\s.*;(.*)", "\\4", headers$raw)
  # genus = first whitespace-delimited token of the strain string
  headers$genus <- gsub("(.*?)\\s(.*)", "\\1", headers$strain)
  # species = second whitespace-delimited token
  headers$species <- gsub("(.*?) (.*)", "\\1", gsub("(.*?)\\s(.*)", "\\2", headers$strain))
  headers$binom <- paste(headers$genus, headers$species)
  #headers$name <- gsub(">(.*?)\\s.*", "\\1", headers$raw)
  # alignment coordinates embedded in "accession.start.stop"
  headers$start <- as.numeric(gsub("(.*)\\.(.+)\\.(.*?)\\s.*", "\\2", headers$raw))
  headers$stop <-  as.numeric(gsub("(.*)\\.(.+)\\.(.*?)\\s.*", "\\3", headers$raw))
  # prefixes longer than 3 letters are WGS-style accessions
  headers$wgs <- nchar(headers$pre) > 3 # see table
  # alignments starting near base 1 are presumed amplicons -- TODO confirm
  headers$amplicon <- headers$start < 10
  
  
  # NCBI accession prefixes denoting genome-project data (comma-separated
  # lists in codes_from_ncbi, split and whitespace-stripped) ...
  genome_pres <- codes_from_ncbi %>% 
    filter(Type=="Genome project data") %>% 
    mutate(Prefix = strsplit(Prefix, ",")) %>% unnest(Prefix) %>%
    transform(Prefix=gsub(" ", "", Prefix)) %>% 
    select(Prefix) 
  # ... and prefixes denoting patent sequences
  patent_pres <- codes_from_ncbi %>% filter(grepl("Patent", Type)) %>%
    mutate(Prefix = strsplit(Prefix, ",")) %>% unnest(Prefix) %>%
    transform(Prefix=gsub(" ", "", Prefix)) %>% 
    select(Prefix) 
  headers$cg <- headers$pre %in% genome_pres$Prefix
  headers$patent <- headers$pre %in% patent_pres$Prefix
  # AC / DP: high-throughput genomic sequence prefixes
  headers$htgs <- headers$pre %in% c("AC", "DP")
  
  # proportions computed for interactive inspection; values are discarded
  # inside the function
  table(headers$cg) / nrow(headers)
  table(headers$wgs) / nrow(headers)
  table(headers$amplicon) / nrow(headers)
  
  # first matching rule wins: Draft outranks HTGS, Patent, etc.
  headers$category <- case_when(
    headers$wgs ~ "Draft",
    headers$htgs ~ "HTGS",
    headers$patent ~ "Patent",
    headers$cg ~ "Complete Genome",
    headers$amplicon ~  "Amplicon")
  round(table(headers$category, useNA = "always") / nrow(headers) * 100, 2)
  # matcher: full id for complete genomes, prefix plus two characters for
  # WGS accessions, NA otherwise
  headers$matcher <- ifelse(
    headers$cg, 
    headers$id,
    ifelse(headers$wgs, 
           substr(headers$id, start = 1, stop =( nchar(headers$pre) + 2)),
           NA)
  )
  return( headers )
}
#datasets <- list()  #first item string set, second is the header info
# SILVA SSU Ref release 132 (readRNAStringSet is from Biostrings)
silva <- readRNAStringSet("~/Downloads/SILVA_132_SSURef_tax_silva.fasta.gz")

# first, we need to exclude any bad sequences
# focusDB outputs (questionable annotations already filtered out above)
all_focus_full <- readRNAStringSet("./results/full_focusDB_ribo16s.fasta")
all_focus_fast <- readRNAStringSet("./results/fast_focusDB_ribo16s.fasta")
de_novo <-  readRNAStringSet("./results/de_novo_seqs.fasta")


# Bundle each database with its parsed header metadata so downstream
# code can treat them uniformly.
datasets <-  list(
  silva = list(set = silva, metadata = process_db(xset=silva)),
  fast =  list(set = all_focus_fast, metadata = process_db(xset=all_focus_fast)), 
  full =  list(set = all_focus_full, metadata = process_db(xset=all_focus_full)),
  denovo = list(set = de_novo, metadata = process_db(xset=de_novo))
)
# Seed-then-filter trick: create a one-row all-NA frame to establish the
# column names, then drop the NA row, leaving an empty frame to rbind onto.
uniqueness <- data.frame("Organism"=NA, "Unique"=NA,  "Duplicated"=NA, "DB"=NA, stringsAsFactors = F)
uniqueness <- uniqueness[!is.na(uniqueness),]
# note that we only get the species level results here, later we address the genus level

#all_focus <- all_focus[!duplicated(all_focus), ]
# Report, per database, how many sequences would be removed by the
# ambiguity (contains "N") and minimum-length (>1200 nt) filters.
# NOTE(review): thisset is recomputed each iteration and never kept --
# this loop only prints counts and raises warnings.  Also, the warning
# message mentions only ambiguous nucleotides, but the count includes
# sequences dropped by the length filter too.
for (i in c("silva", "full", "fast")){
  set <- datasets[[i]]
  raw_len <- length(set[["set"]])
  print(raw_len)
  # keep sequences with zero "N" matches, then those longer than 1200 nt
  thisset <- set[["set"]][lapply(vmatchPattern("N", set[["set"]])@ends, length) == 0]
  thisset <- thisset[lengths(thisset) > 1200]
  no_n_len <- length(thisset)
  if (no_n_len != raw_len) warning(paste0(" ",raw_len-no_n_len, " sequences with ambiguous nucleotides from the '", i, "' dataset"))
}
## [1] 2090668
## Warning: 120227 sequences with ambiguous nucleotides from the 'silva'
## dataset
## [1] 4789
## Warning: 23 sequences with ambiguous nucleotides from the 'full' dataset
## [1] 5738
# For each species (binomial) in the focusDB "fast" set, count unique vs
# duplicated sequences in SILVA alone, focusDB alone, and the two
# combined; rows append to `uniqueness` with DB = silv / focus / combined.
counter <- 1
for (organism in unique(datasets[["fast"]][["metadata"]]$binom)){
  print(paste("processing", organism, counter, "of", length(unique(datasets[["fast"]][["metadata"]]$binom))))
  # NOTE(review): `organism` is used as a regex, so the "." in names like
  # "Blautia sp." matches any character -- confirm this is acceptable.
  silva_subset <-silva[grepl(organism, names(datasets[["silva"]][["set"]]))]
  nsilv <- length(silva_subset)
  nsilvdup = sum(duplicated(silva_subset), na.rm = TRUE)
  uniqueness <- rbind(
    uniqueness, 
    data.frame("Organism"=organism, "Unique"=nsilv-nsilvdup,  "Duplicated"=nsilvdup, "DB"="silv"))
  focus_subset <- datasets[["fast"]][["set"]][grepl(organism, names(datasets[["fast"]][["set"]]))] 
  nfoc <- length(focus_subset)
  nfocdup = sum(duplicated(focus_subset), na.rm = TRUE)
  uniqueness <- rbind(
    uniqueness, 
    data.frame("Organism"=organism, "Unique"=nfoc-nfocdup,  "Duplicated"=nfocdup, "DB"="focus"))
  # convert the SILVA (RNA) subset to DNA before concatenating with the
  # focusDB subset -- TODO confirm both alphabets end up comparable
  both_subsets  <- append(DNAStringSet(silva_subset), focus_subset)
  nboth <- length(both_subsets)
  nbothdup = sum(duplicated(both_subsets), na.rm = TRUE)
  uniqueness <- rbind(
    uniqueness, 
    # bug fix: Duplicated was previously recorded as nboth (the total
    # sequence count) instead of nbothdup (the number of duplicates)
    data.frame("Organism"=organism, "Unique"=nboth-nbothdup,  "Duplicated"=nbothdup, "DB"="combined"))
  counter <- counter + 1
}
## [1] "processing Acinetobacter baumannii 1 of 223"
## [1] "processing Acinetobacter pittii 2 of 223"
## [1] "processing Acinetobacter nosocomialis 3 of 223"
## [1] "processing Acinetobacter junii 4 of 223"
## [1] "processing Acinetobacter bereziniae 5 of 223"
## [1] "processing Actinomyces oris 6 of 223"
## [1] "processing Actinomyces sp. 7 of 223"
## [1] "processing Aerococcus urinae 8 of 223"
## [1] "processing Aerococcus sanguinicola 9 of 223"
## [1] "processing Aerococcus viridans 10 of 223"
## [1] "processing Aerococcus urinaeequi 11 of 223"
## [1] "processing Aerococcus christensenii 12 of 223"
## [1] "processing Aerococcus urinaehominis 13 of 223"
## [1] "processing Akkermansia muciniphila 14 of 223"
## [1] "processing Alistipes finegoldii 15 of 223"
## [1] "processing Alistipes shahii 16 of 223"
## [1] "processing Bacillus cereus 17 of 223"
## [1] "processing Bacillus anthracis 18 of 223"
## [1] "processing Bacillus licheniformis 19 of 223"
## [1] "processing Bacillus thuringiensis 20 of 223"
## [1] "processing Bacillus subtilis 21 of 223"
## [1] "processing Bacillus megaterium 22 of 223"
## [1] "processing Bacillus safensis 23 of 223"
## [1] "processing Bacillus velezensis 24 of 223"
## [1] "processing Bacteroides fragilis 25 of 223"
## [1] "processing Bacteroides caccae 26 of 223"
## [1] "processing Bacteroides zoogleoformans 27 of 223"
## [1] "processing Bacteroides thetaiotaomicron 28 of 223"
## [1] "processing Bacteroides dorei 29 of 223"
## [1] "processing Bacteroides helcogenes 30 of 223"
## [1] "processing Bacteroides ovatus 31 of 223"
## [1] "processing Bacteroides caecimuris 32 of 223"
## [1] "processing Bacteroides vulgatus 33 of 223"
## [1] "processing Bacteroides cellulosilyticus 34 of 223"
## [1] "processing Bifidobacterium adolescentis 35 of 223"
## [1] "processing Bifidobacterium longum 36 of 223"
## [1] "processing Bifidobacterium bifidum 37 of 223"
## [1] "processing Bifidobacterium animalis 38 of 223"
## [1] "processing Bifidobacterium actinocoloniiforme 39 of 223"
## [1] "processing Bifidobacterium asteroides 40 of 223"
## [1] "processing Bifidobacterium dentium 41 of 223"
## [1] "processing Bifidobacterium pseudocatenulatum 42 of 223"
## [1] "processing Bifidobacterium pseudolongum 43 of 223"
## [1] "processing Bifidobacterium coryneforme 44 of 223"
## [1] "processing Bifidobacterium choerinum 45 of 223"
## [1] "processing Bifidobacterium breve 46 of 223"
## [1] "processing Bifidobacterium scardovii 47 of 223"
## [1] "processing Bifidobacterium angulatum 48 of 223"
## [1] "processing Blautia hansenii 49 of 223"
## [1] "processing Blautia sp. 50 of 223"
## [1] "processing Bordetella bronchiseptica 51 of 223"
## [1] "processing Bordetella pertussis 52 of 223"
## [1] "processing Bordetella holmesii 53 of 223"
## [1] "processing Burkholderia territorii 54 of 223"
## [1] "processing Burkholderia pseudomallei 55 of 223"
## [1] "processing Burkholderia ubonensis 56 of 223"
## [1] "processing Burkholderia cenocepacia 57 of 223"
## [1] "processing Burkholderia multivorans 58 of 223"
## [1] "processing Burkholderia stagnalis 59 of 223"
## [1] "processing Burkholderia cepacia 60 of 223"
## [1] "processing Burkholderia sp. 61 of 223"
## [1] "processing Burkholderia thailandensis 62 of 223"
## [1] "processing Capnocytophaga leadbetteri 63 of 223"
## [1] "processing Capnocytophaga sputigena 64 of 223"
## [1] "processing Capnocytophaga cynodegmi 65 of 223"
## [1] "processing Capnocytophaga ochracea 66 of 223"
## [1] "processing Capnocytophaga canimorsus 67 of 223"
## [1] "processing Capnocytophaga sp. 68 of 223"
## [1] "processing Capnocytophaga haemolytica 69 of 223"
## [1] "processing Capnocytophaga gingivalis 70 of 223"
## [1] "processing Chryseobacterium glaciei 71 of 223"
## [1] "processing Chryseobacterium indologenes 72 of 223"
## [1] "processing Chryseobacterium sp. 73 of 223"
## [1] "processing Chryseobacterium taklimakanense 74 of 223"
## [1] "processing Clostridium perfringens 75 of 223"
## [1] "processing Bacillus coagulans 76 of 223"
## [1] "processing Clostridium botulinum 77 of 223"
## [1] "processing Clostridium ljungdahlii 78 of 223"
## [1] "processing Clostridium estertheticum 79 of 223"
## [1] "processing Clostridium beijerinckii 80 of 223"
## [1] "processing Collinsella aerofaciens 81 of 223"
## [1] "processing Corynebacterium striatum 82 of 223"
## [1] "processing Corynebacterium jeikeium 83 of 223"
## [1] "processing Corynebacterium diphtheriae 84 of 223"
## [1] "processing Corynebacterium sp. 85 of 223"
## [1] "processing Corynebacterium simulans 86 of 223"
## [1] "processing Cutibacterium avidum 87 of 223"
## [1] "processing Cutibacterium acnes 88 of 223"
## [1] "processing Deinococcus radiodurans 89 of 223"
## [1] "processing Deinococcus swuensis 90 of 223"
## [1] "processing Desulfovibrio alaskensis 91 of 223"
## [1] "processing Desulfovibrio sp. 92 of 223"
## [1] "processing Desulfovibrio hydrothermalis 93 of 223"
## [1] "processing Enterobacter cloacae 94 of 223"
## [1] "processing Enterobacter hormaechei 95 of 223"
## [1] "processing Enterobacter bugandensis 96 of 223"
## [1] "processing Enterobacteriaceae bacterium 97 of 223"
## [1] "processing Enterobacter asburiae 98 of 223"
## [1] "processing Enterococcus faecalis 99 of 223"
## [1] "processing Enterococcus faecium 100 of 223"
## [1] "processing Escherichia coli 101 of 223"
## [1] "processing Eubacterium limosum 102 of 223"
## [1] "processing Faecalibacterium prausnitzii 103 of 223"
## [1] "processing Faecalitalea cylindroides 104 of 223"
## [1] "processing Finegoldia magna 105 of 223"
## [1] "processing Flavonifractor plautii 106 of 223"
## [1] "processing Fusobacterium necrophorum 107 of 223"
## [1] "processing Fusobacterium ulcerans 108 of 223"
## [1] "processing Fusobacterium nucleatum 109 of 223"
## [1] "processing Fusobacterium varium 110 of 223"
## [1] "processing Fusobacterium gonidiaformans 111 of 223"
## [1] "processing Fusobacterium mortiferum 112 of 223"
## [1] "processing Fusobacterium periodonticum 113 of 223"
## [1] "processing Gardnerella vaginalis 114 of 223"
## [1] "processing Gemella morbillorum 115 of 223"
## [1] "processing Haemophilus influenzae 116 of 223"
## [1] "processing Haemophilus parainfluenzae 117 of 223"
## [1] "processing Hafnia alvei 118 of 223"
## [1] "processing Helicobacter pylori 119 of 223"
## [1] "processing Helicobacter cinaedi 120 of 223"
## [1] "processing Herbaspirillum hiltneri 121 of 223"
## [1] "processing Herbaspirillum seropedicae 122 of 223"
## [1] "processing Hydrogenobaculum sp. 123 of 223"
## [1] "processing Intestinimonas butyriciproducens 124 of 223"
## [1] "processing Klebsiella pneumoniae 125 of 223"
## [1] "processing Klebsiella variicola 126 of 223"
## [1] "processing Klebsiella aerogenes 127 of 223"
## [1] "processing Klebsiella quasipneumoniae 128 of 223"
## [1] "processing Lactobacillus gasseri 129 of 223"
## [1] "processing Lactobacillus plantarum 130 of 223"
## [1] "processing Lactobacillus rhamnosus 131 of 223"
## [1] "processing Lactobacillus jensenii 132 of 223"
## [1] "processing Lactobacillus reuteri 133 of 223"
## [1] "processing Lactobacillus paracollinoides 134 of 223"
## [1] "processing Lactobacillus paracasei 135 of 223"
## [1] "processing Lactobacillus casei 136 of 223"
## [1] "processing Lactobacillus buchneri 137 of 223"
## [1] "processing Lactobacillus acetotolerans 138 of 223"
## [1] "processing Listeria monocytogenes 139 of 223"
## [1] "processing Staphylococcus epidermidis 140 of 223"
## [1] "processing Mesorhizobium amorphae 141 of 223"
## [1] "processing Mesorhizobium loti 142 of 223"
## [1] "processing Mesorhizobium ciceri 143 of 223"
## [1] "processing Methylobacterium radiotolerans 144 of 223"
## [1] "processing Methylobacterium sp. 145 of 223"
## [1] "processing Methylobacterium phyllosphaerae 146 of 223"
## [1] "processing Mycoplasma pneumoniae 147 of 223"
## [1] "processing Mycoplasma cloacale 148 of 223"
## [1] "processing Mycoplasma anatis 149 of 223"
## [1] "processing Mycoplasma bovis 150 of 223"
## [1] "processing Mycoplasma gallisepticum 151 of 223"
## [1] "processing Neisseria meningitidis 152 of 223"
## [1] "processing Neisseria sicca 153 of 223"
## [1] "processing Neisseria gonorrhoeae 154 of 223"
## [1] "processing Nitrosomonas sp. 155 of 223"
## [1] "processing Nitrosomonas ureae 156 of 223"
## [1] "processing Nitrosomonas communis 157 of 223"
## [1] "processing Nitrosomonas eutropha 158 of 223"
## [1] "processing Nitrosomonas europaea 159 of 223"
## [1] "processing Nostoc sp. 160 of 223"
## [1] "processing Achromobacter denitrificans 161 of 223"
## [1] "processing Nostoc punctiforme 162 of 223"
## [1] "processing Nostoc linckia 163 of 223"
## [1] "processing Odoribacter splanchnicus 164 of 223"
## [1] "processing Parabacteroides sp. 165 of 223"
## [1] "processing Parabacteroides distasonis 166 of 223"
## [1] "processing Paracoccus aminovorans 167 of 223"
## [1] "processing Paracoccus denitrificans 168 of 223"
## [1] "processing Paracoccus mutanolyticus 169 of 223"
## [1] "processing Paracoccus yeei 170 of 223"
## [1] "processing Anaerococcus mediterraneensis 171 of 223"
## [1] "processing Ezakiella massiliensis 172 of 223"
## [1] "processing Acidaminococcus fermentans 173 of 223"
## [1] "processing Porphyromonas gingivalis 174 of 223"
## [1] "processing Porphyromonas crevioricanis 175 of 223"
## [1] "processing Prevotella ruminicola 176 of 223"
## [1] "processing Prevotella scopos 177 of 223"
## [1] "processing Prevotella denticola 178 of 223"
## [1] "processing Pseudomonas aeruginosa 179 of 223"
## [1] "processing Pseudomonas veronii 180 of 223"
## [1] "processing Pseudomonas syringae 181 of 223"
## [1] "processing Pseudomonas putida 182 of 223"
## [1] "processing Pseudomonas koreensis 183 of 223"
## [1] "processing Pseudomonas fluorescens 184 of 223"
## [1] "processing Pseudomonas protegens 185 of 223"
## [1] "processing Pseudomonas orientalis 186 of 223"
## [1] "processing Pseudomonas stutzeri 187 of 223"
## [1] "processing Rhodobacter sphaeroides 188 of 223"
## [1] "processing Rhodobacter capsulatus 189 of 223"
## [1] "processing Rhodovulum sulfidophilum 190 of 223"
## [1] "processing Rothia mucilaginosa 191 of 223"
## [1] "processing Rothia dentocariosa 192 of 223"
## [1] "processing Ruminococcus albus 193 of 223"
## [1] "processing Salinispora arenicola 194 of 223"
## [1] "processing Salinispora tropica 195 of 223"
## [1] "processing Shewanella baltica 196 of 223"
## [1] "processing Shewanella oneidensis 197 of 223"
## [1] "processing Shewanella sp. 198 of 223"
## [1] "processing Shewanella frigidimarina 199 of 223"
## [1] "processing Shewanella algae 200 of 223"
## [1] "processing Shewanella japonica 201 of 223"
## [1] "processing Staphylococcus aureus 202 of 223"
## [1] "processing Staphylococcus pseudintermedius 203 of 223"
## [1] "processing Staphylococcus lugdunensis 204 of 223"
## [1] "processing Staphylococcus argenteus 205 of 223"
## [1] "processing Stenotrophomonas maltophilia 206 of 223"
## [1] "processing Stenotrophomonas sp. 207 of 223"
## [1] "processing Streptococcus pneumoniae 208 of 223"
## [1] "processing Streptococcus agalactiae 209 of 223"
## [1] "processing Streptococcus suis 210 of 223"
## [1] "processing Streptococcus gordonii 211 of 223"
## [1] "processing Streptococcus pyogenes 212 of 223"
## [1] "processing Sulfitobacter sp. 213 of 223"
## [1] "processing Thermus thermophilus 214 of 223"
## [1] "processing Thermus oshimai 215 of 223"
## [1] "processing Treponema pedis 216 of 223"
## [1] "processing Treponema brennaborense 217 of 223"
## [1] "processing Treponema caldarium 218 of 223"
## [1] "processing Homo sapiens 219 of 223"
## [1] "processing Treponema pallidum 220 of 223"
## [1] "processing Treponema succinifaciens 221 of 223"
## [1] "processing Turicibacter sp. 222 of 223"
## [1] "processing Veillonella parvula 223 of 223"
#uniqueness <- uniqueness[1105:nrow(uniqueness), ]
# set an arbitrary cutoff for popularity, just for plotting purposes
popthresh = 100; popular <- uniqueness %>% filter(DB=='silv') %>% filter(Unique > popthresh | Duplicated > popthresh) 
#popular$Organism <- as.character(popular$Organism)
# Reshape the uniqueness counts so each sequence is attributed to one database:
#   SILVA   = combined - focus  (sequences not contributed by focusDB)
#   focusDB = combined - silv   (sequences not already present in SILVA)
p_uniqueness <- uniqueness %>% distinct() %>% gather(key="grp", value="val", -Organism, -DB ) %>%
  spread(key=DB, value=val) %>% 
  mutate(SILVA=combined-focus, focusDB=combined-silv) %>% 
  select(-silv, -combined, -focus)  %>%
  gather(key="DB", value="val", -Organism, -grp ) %>%
  # reverse levels for plotting (so organisms read top-to-bottom after coord_flip)
  mutate(Organism = factor(Organism), 
         Organism = factor(Organism, levels = rev(levels(Organism))))
# Horizontal stacked bars: sequence counts per organism, fill = database,
# alpha distinguishes the Unique vs Duplicated groups
(p_unique <- ggplot(
  p_uniqueness, 
  #%>% mutate(fac = ifelse(Organism %in% popular$Organism, "Over-represented", "Under-represented")),
  aes(x=Organism, y=val, alpha=grp, group=grp, fill=DB)) +
    coord_flip() +
    #facet_wrap(~fac, scales="free") +
    scale_fill_manual(values = viridis::viridis(n=3)[c(2, 1)]) +
    scale_alpha_manual(values = c(.2, 1)) + mytheme +
    labs(y="Number of Sequence", x="", alpha="", fill="Database") +
    theme(axis.text=element_text(size=13)) +
    geom_bar(stat="identity", position="stack") #+ facet_wrap(~grp) 
)

# Alternative view of the plot above: unique sequences only (Duplicated
# rows filtered out), so the alpha scale has no effect here
(p_unique_alt <- ggplot(
  p_uniqueness %>% 
    #mutate(fac = ifelse(Organism %in% popular$Organism, "Over-represented", "Under-represented"))%>% 
    filter(grp=="Unique"),
  aes(x=Organism, y=val,  fill=DB)) +
    coord_flip() +
    #facet_wrap(~fac, scales="free") +
    scale_fill_manual(values = viridis::viridis(n=3)[c(2, 1)]) +
    scale_alpha_manual(values = c(.2, 1)) + mytheme +
    labs(y="Number of Sequence", x="", alpha="", fill="Database") +
    theme(axis.text=element_text(size=13)) +
    geom_bar(stat="identity", position="stack") #+ facet_wrap(~grp) 
)

# Percent contribution of focusDB to the unique-sequence pool, per species
# (binom) and aggregated per genus.
#   perc_inc    = focusDB uniques as a percent increase over SILVA's uniques
#   perc_of_tot = focusDB uniques as a percent of the combined total
p_unique_perc <-
  p_uniqueness %>% spread(grp, val) %>%
    dplyr::rename(binom = Organism) %>%
    select(-Duplicated) %>% spread(DB, Unique) %>%
    mutate(
      perc_inc = round((focusDB / SILVA) * 100, 1),
      perc_of_tot = round(focusDB / (SILVA + focusDB) * 100, 1)
    ) %>%
  filter(!is.na(perc_inc)) %>% ungroup() %>%
  # genus = first word of the binomial
  mutate(genus = gsub("(.*) .*", "\\1", binom)) %>%
  group_by(genus) %>%
  # same two percentages computed on genus-level sums
  # (FIX: column was previously misspelled "perc_of_tot_genux")
  mutate(
    perc_inc_genus = round((sum(focusDB) / sum(SILVA)) * 100, 1),
    perc_of_tot_genus = round(sum(focusDB) / (sum(SILVA) + sum(focusDB)) * 100, 1))


# (p_unique_heat <- ggplot(p_unique_perc,
#   aes(x=binom, y=mock, fill=mock)) +
#     coord_flip() +
#     scale_fill_discrete(guide="none")+
#     theme(rect = element_blank(),
#           text=element_text(color="white"),
#           axis.text.y  = element_blank(),
#           axis.text.x  = element_text(size=10),
#           line = element_blank(),
#             
#       )+
#   geom_tile())
# Per-species percent increase in unique sequences attributable to focusDB
(p_unique_species <- ggplot(p_unique_perc %>% distinct(),
  aes(x=binom, y=perc_inc)) +
    coord_flip() +
    scale_fill_manual(values = viridis::viridis(n=3)) +
    mytheme +
    labs(y="Percent increase in Unique Sequences", x="", alpha="", fill="Database") +
    theme(axis.text=element_text(size=13)) +
    geom_bar(stat="identity") #+ facet_wrap(~grp) 
)

# Genus-level version of the plot above, ordered by percent increase
(p_unique_genus <- ggplot(
  p_unique_perc %>% select(genus, perc_inc_genus) %>% distinct(),
  aes(x=reorder(genus, perc_inc_genus), y=perc_inc_genus)) +
    coord_flip() +
    scale_fill_manual(values = viridis::viridis(n=3)) +
    mytheme +
    labs(y="Percent increase in Unique Sequences", x="", alpha="", fill="Database") +
    theme(axis.text=element_text(size=13)) +
    geom_bar(stat="identity") #+ facet_wrap(~grp) 
)

#(figure <- ggpubr::ggarrange(p_unique_heat, p_uniqueV, widths=c(1, 3)))
# Write the uniqueness figures used in the manuscript (Figure 3 variants)
ggsave("docs/figures/3-unique.png", p_unique, device = "png", dpi = 300, width = 12, height = 10) 
# length(unique(p_unique_perc$genus))
ggsave("docs/figures/3-perc_unique.png", p_unique_species, device = "png", dpi = 300, width = 12, height = 10) 
ggsave("docs/figures/3-perc_unique_genus.png", p_unique_genus, device = "png", dpi = 300, width = 12, height = 10) 
ggsave("docs/figures/3-unique_alt2.png", p_unique_alt, device = "png", dpi = 300, width = 12, height = 10)
#ggsave("docs/figures/3-unique_alt.png", figure, device = "png", dpi = 300, width = 12, height = 10)

Here, we generate a table showing how many sequences (and unique sequences) are obtained from each level. This is used with Table X in the paper.

# A: unique-sequence counts per organism, one column per database
A <- uniqueness %>% select(Organism, Unique, DB ) %>% distinct() %>%spread(key=DB, value=Unique)
# B: per-strain SRA counts; Good_SRAs = runs whose pmessage is "16S Recovered"
B <- p_summary_data %>% select(strain, pmessage, state) %>% filter(!is.na(state)) %>% group_by(strain) %>% summarise(All_SRAs=n(), Good_SRAs=table(pmessage)["16S Recovered"]) 
# join on strain/Organism (warning below is the factor -> character coercion)
# and split the binomial into Genus and Species for the manuscript table
right_join(B, A, by=c("strain"="Organism")) %>% 
  #filter(!is.na(All_SRAs)) %>%
  mutate(Genus=gsub("(.*) (.*)", "\\1", strain),
         Species=gsub("(.*) (.*)", "\\2", strain)) %>%
  select(Genus, Species, All_SRAs, Good_SRAs, focus, silv, combined)
## Warning: Column `strain`/`Organism` joining character vector and factor,
## coercing into character vector
## # A tibble: 223 x 7
##    Genus         Species      All_SRAs Good_SRAs focus  silv combined
##    <chr>         <chr>           <int>     <int> <int> <int>    <int>
##  1 Acinetobacter baumannii          NA        NA    41   883      924
##  2 Acinetobacter pittii             NA        NA     9    80       89
##  3 Acinetobacter nosocomialis       NA        NA     6    31       37
##  4 Acinetobacter junii              NA        NA     4   128      132
##  5 Acinetobacter bereziniae         NA        NA     5    25       30
##  6 Actinomyces   oris               NA        NA     5    29       34
##  7 Actinomyces   sp.                NA        NA     7   234      241
##  8 Aerococcus    urinae             NA        NA   104    57      161
##  9 Aerococcus    sanguinicola       NA        NA     9     8       17
## 10 Aerococcus    viridans           NA        NA     4    59       63
## # … with 213 more rows
 # %>% mutate(perc  = focus/silv*100) %>% select(perc) %>% summary()

Generating Multiple Sequence Alignments

Defining the regions of 16S in E coli

The coordinates for the common 16S primers correspond to E coli, so we selected this sequence. Reference: - https://www.ncbi.nlm.nih.gov/nuccore/NC_000913.3?report=fasta&from=4166659&to=4168200 - the regions https://bmcbioinformatics.biomedcentral.com/track/pdf/10.1186/s12859-016-0992- These are implemented in the calculate-shannon-entropy script.

# Combining sequences
# For each species with focusDB results, merge its focusDB 16S sequences into
# that species' SILVA sequences (as lower-case DNA):
while read genus species; do if [ -f "./results/2019-09-26-focusdb-results/${genus}_${species}_ribo16s.fasta" ] ; then combine-focusdb-and-silva -d ~/Downloads/SILVA_132_SSURef_tax_silva.fasta -n "$genus $species" -S  ./results/2019-09-26-focusdb-results/${genus}_${species}_ribo16s.fasta --dna --lower  > ./results/extracted_sequences/${genus}_${species}_db.fasta ; fi; done < ./docs/extreme_species_list.txt
# alignment + trimming of each combined per-species database
while read genus species; do if [ -f "./results/2019-09-26-focusdb-results/${genus}_${species}_ribo16s.fasta" ] ; then align-and-trim-focusdb --i ./results/extracted_sequences/${genus}_${species}_db.fasta -o ./results/extracted_sequences/${genus}_${species}_db ; fi; done < ./docs/extreme_species_list.txt
#  
## Calculating Alpha Diversity

# Read the per-position Shannon entropy table written by the
# calculate-shannon-entropy script (tab-separated, no header):
#   pos, region, entropy (SILVA + focusDB), entropy_subset (SILVA only).
# FIX: use TRUE/FALSE rather than the reassignable T/F shorthands.
shannon <- read.csv(
  "./tmpalign.mafft.shannon", sep="\t", header = FALSE,
  col.names = c("pos", "region", "entropy", "entropy_subset"), 
  stringsAsFactors = FALSE) %>% 
  # long format: one row per (pos, subset), with a readable group label
  gather(key="subset", value="entropy", -pos, -region) %>%
  mutate(grp = case_when(
    subset == "entropy" ~ "SILVA + focusDB",
    subset == "entropy_subset" ~ "SILVA"
  ))
str(shannon)

# Entropy along the 16S alignment, colored by sequence set, filled by 16S
# variable region; smoothed trend overlaid
(p_diversity <- ggplot(shannon, aes(x=pos, color=grp, shape=grp, fill=region, y=entropy)) + 
    scale_shape_manual(values=c(21, 22)) + 
  geom_point(size=4, alpha=.5) + geom_smooth())
ggsave("docs/figures/4-diversity.png", p_diversity, device = "png", dpi = 300, width = 11, height = 7)

Assessing the provenance of the Silva strains

16S sequences in databases such as SILVA, Greengenes, or RDP can come from several sources, namely: - complete genomes (in RefSeq, GenBank, etc.) - draft genomes (in the NCBI’s Assembly database) - 16S amplicons from Sanger sequencing - 16S amplicons from high-throughput amplicon sequencing.

As repeated rDNA operons are difficult to assemble, draft genomes can (and often do) contain a single rDNA. This is problematic for species identification – the rDNA recovered is not just one of /n/ rDNAs, but can be a consensus “summary” rDNA resulting from trying to assemble the repeated region.

riboSeed has been shown to generate high-quality reconstructions of each rDNA when benchmarked against hybrid assemblies. Here, we compare the 16S sequences from riboSeed reassembly of draft genomes to the initial (potentially collapsed) 16S sequences.

Provenance of strains

First, we read in the silva headers for the strains. The header for the silva database consists of >ACCESSION.start.end name. After a fruitless effort to rework sraFind’s entrez calls to identify WGS master records with sraFind, I realized that info was already in prokaryotes.txt, and could be added as an afterthought. As of release 0.8, that column is present as “WGS”. This allows us to relate the sequences in the SILVA headers to their original SRAs, so we can compare riboSeed’s assemblies to those for which a complete genome is available, and assess the types of errors/collapses we see where only draft genomes are available.

# read in the draft linked accessions and the complete genome linked accessions
# if (!file.exists("sraFind.tab")){
#   system("curl -O sraFind.tab -L https://raw.githubusercontent.com/nickp60/sraFind/master/sraFind.tab")
# }
# 
srafind <- read.csv("sraFind.tab", sep="\t", stringsAsFactors = F)
#  make a boolean mask for all the silva seqs we might care about
# (silva metadata rows with a usable matcher accession)
ngs_silva <- datasets[["silva"]][["metadata"]][!is.na(datasets[["silva"]][["metadata"]]$matcher), ]
this_mask <- rep(FALSE, nrow(ngs_silva))
# mark every silva record whose raw header mentions any organism we processed
these_orgs <- paste(datasets[["fast"]][["metadata"]]$genus, datasets[["fast"]][["metadata"]]$species)
for (org in unique(these_orgs)){
  this_mask <- this_mask | grepl(org, ngs_silva$raw, fixed=T)
}
table(this_mask)
## this_mask
##  FALSE   TRUE 
## 177975  80360
silv_sub <- ngs_silva[this_mask,]
# We take any NZ_ prefix off the complete genome records.  If you know why silva uses genbank,
# raise an issue on github, using the codeword "vermillion"
# matcher: the WGS master accession for drafts (WGS ending in "01"), otherwise
# the version-stripped, NZ_-stripped first-chromosome nuccore accession
srafind$matcher <- ifelse(endsWith(srafind$WGS, "01"), srafind$WGS, 
                          gsub("NZ_", "", gsub("(.*?)\\.(.*)", "\\1", srafind$nuccore_first_chrom), fixed = T))
#View(srafind[grepl(".*_,*",  srafind$nuccore_first_chrom), ])
# join silva records to sraFind rows (with run accessions) via matcher
assembly_summary <- inner_join(
  silv_sub, 
  srafind %>% select(Assembly.Accession, matcher, platform,  run_SRAs) %>% filter(!is.na(run_SRAs)), 
  by="matcher")

# per (strain, assembly) counts of distinct silva records, for draft genomes
(p_summary_draft <- assembly_summary  %>% filter(wgs) %>% group_by(strain,  Assembly.Accession) %>% 
     select(strain, Assembly.Accession, id, start, stop) %>% distinct() %>%  summarize(n=n()))
## # A tibble: 25,678 x 3
## # Groups:   strain [7,162]
##    strain                     Assembly.Accession     n
##    <chr>                      <chr>              <int>
##  1 Acidaminococcus fermentans GCA_900107075.1        1
##  2 Acidaminococcus fermentans GCA_900115425.1        1
##  3 Acinetobacter baumannii    GCA_000734775.1        1
##  4 Acinetobacter baumannii    GCA_000876445.1        1
##  5 Acinetobacter baumannii    GCA_001007685.1        1
##  6 Acinetobacter baumannii    GCA_001007705.1        1
##  7 Acinetobacter baumannii    GCA_001007725.1        1
##  8 Acinetobacter baumannii    GCA_001007745.1        1
##  9 Acinetobacter baumannii    GCA_001007765.1        1
## 10 Acinetobacter baumannii    GCA_001007815.1        1
## # … with 25,668 more rows
# same, for complete genomes (note drafts mostly show 1 rDNA, CGs several)
(p_summary_cg    <-    assembly_summary  %>% filter(cg) %>% group_by(strain,  Assembly.Accession) %>% 
     select(strain, Assembly.Accession, id, start, stop) %>% distinct() %>%  summarize(n=n()))
## # A tibble: 854 x 3
## # Groups:   strain [182]
##    strain                  Assembly.Accession     n
##    <chr>                   <chr>              <int>
##  1 Acinetobacter baumannii GCA_000814345.1        6
##  2 Acinetobacter baumannii GCA_000939415.2        6
##  3 Acinetobacter baumannii GCA_001026965.1        6
##  4 Acinetobacter baumannii GCA_001399655.1        6
##  5 Acinetobacter baumannii GCA_001514375.1        6
##  6 Acinetobacter baumannii GCA_001573065.1        6
##  7 Acinetobacter baumannii GCA_001573085.1        6
##  8 Acinetobacter baumannii GCA_001573105.1        6
##  9 Acinetobacter baumannii GCA_001573125.1        6
## 10 Acinetobacter baumannii GCA_001578145.1        6
## # … with 844 more rows
# one row per (strain, assembly) with n = number of distinct silva records
# attributed to that assembly, plus the assembly category
p_summary_n16s <- assembly_summary %>% select(raw, strain, Assembly.Accession,category) %>%distinct() %>% group_by(strain,  Assembly.Accession) %>%  mutate(n=n()) %>%  select(-raw) %>% distinct()
(p_n16s <-  ggplot(p_summary_n16s, aes(x=n,  fill=category)) + geom_density(adjust=4, alpha=.6) + 
    labs(x="Number of 16S  in assembly", y="Relative Distriubtion", fill="", title="Distribution of rDNA counts by assembly level") 
)

ggsave("docs/figures/S4-n16S.png", p_n16s, device = "png", dpi = 300, width = 6, height = 5)

# tabulate rDNA counts: complete genomes vs everything else
table(p_summary_n16s[p_summary_n16s$category=="Complete Genome", "n"])
## 
##   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16 
##  28  58 357  98  37  78  60  80   7  11  11   1   8  14   3   3
table(p_summary_n16s[p_summary_n16s$category!="Complete Genome", "n"])
## 
##     1     2     3     4     5     6     7     8     9    10    11    12 
## 18818  1399  1358  1195  1329   802   399   212    77    34    21    15 
##    13    14 
##    14     5
# for example, 
# draft_fragilis <- p_summary_draft[grepl("fragilis", p_summary_draft$ScientificName),  ]
build_aln_for_assembly <- function(db, assembly="GCA_000699725.1", trimf=10, trimr=10){
  # Align the SILVA sequences attributed to one assembly accession against the
  # focusDB-recovered sequences for the same assembly's SRAs, and summarize
  # the uniqueness of the focusDB sequences.
  #
  # db:           data frame linking SILVA headers ("raw"), Assembly.Accession,
  #               and the comma-separated run_SRAs column from sraFind
  # assembly:     the assembly accession to process
  # trimf, trimr: bases trimmed from the front/back of the alignment
  #
  # Returns a list: alignment, unique_from_focusDB, total, focusDB, silva,
  # min_snps; or a list of NAs when no SILVA/focusDB matches exist.

  # get matching assembly from sraFind to get the complete genome/sequence accession
  this_silva <- unique(db[grepl(assembly, db$Assembly.Accession), "raw" ])
  if (length(this_silva) == 0) {
    warning(paste("No matches ", assembly, ";  could be long reads?"))
    return (list(NA, NA, NA, NA, NA, NA))
  }
  # paste together to make a query string with or's; this is because for assemblies,
  # different contigs will have different sequence names, but a single assembly accession
  this_silva_query <- gsub("(.*)\\|$", "\\1", paste0(unique(gsub("(.*?)\\..*", "\\1", this_silva)), collapse = "", sep="|"))
  # get matching SRAs for assembly, so we can use them to search the focusdb results
  this_focus <- unique(unlist(strsplit(db[grepl(assembly,  db$Assembly.Accession), "run_SRAs" ],",")))
  # build focusdb query string; get rid of the *_\\d suffix we add to SRAs to keep them unique
  this_focus_query <- gsub("(.*)\\|$", "\\1", paste0(gsub("_\\d+", "",this_focus), collapse = "", sep="|"))

  # identify all sequences associated with this assembly accession.
  # FIX: the original subset the silva set with a mask built from the *fast*
  # metadata, whose rows do not correspond to the silva set; match against the
  # silva sequence names instead (consistent with build_pairwise_network)
  silva_subset_seqs <- datasets[['silva']][["set"]][grepl(this_silva_query, names(datasets[['silva']][["set"]]))]
  # identify the sequences from focusDB
  focus_subset_seqs <- datasets[['fast']][["set"]][grepl(this_focus_query, datasets[['fast']][["metadata"]]$raw)]
  unique_focus_subset_seqs <- unique(focus_subset_seqs)
  if (length(unique_focus_subset_seqs) == 0) {
    warning(paste("No matches for ",this_focus_query,"(", assembly, ")  in focusDB results"))
    return (list(NA, NA, NA, NA, NA, NA))
  }
  # merge, align the sequences
  # Silva by default is in RNA, so we type-cast to DNA
  # We add the silva seqs first so we can check uniqueness by the presence of
  # SRA seqs in the unique'd alignment
  aln <- DECIPHER::AlignSeqs(
    append(DNAStringSet(silva_subset_seqs), unique_focus_subset_seqs)
  )
  # trim the alignment ends (a negative end is counted from the sequence end)
  trimmed <- subseq(aln, trimf, -trimr)
  uniques <- trimmed[!duplicated(trimmed)]

  #BrowseSeqs(uniques, highlight=0)
  # pairwise hamming distances between the deduplicated, trimmed sequences;
  # for each surviving focusDB sequence record its smallest nonzero distance
  dists <- stringDist(uniques, method="hamming")
  min_snps <- list()
  for (i in names(uniques)){
    if (i %in% names(unique_focus_subset_seqs)){
      these_snps <- as.matrix(dists)[grepl(i, labels(dists))]
      nonzero_snps <- these_snps[these_snps > 0]
      # guard: min() of an empty vector warns and returns Inf
      min_snps[[i]] <- if (length(nonzero_snps) > 0) min(nonzero_snps) else NA
    }
  }
  return (list("alignment"=aln, 
               "unique_from_focusDB"=sum(names(unique_focus_subset_seqs) %in% names(uniques)), 
               "total"=length(focus_subset_seqs),
               "focusDB"=length(unique_focus_subset_seqs),
               "silva"=length(silva_subset_seqs),
               "min_snps"=min_snps
               )
  )
}
write_msa <- function(db, assembly="GCA_000699725.1", destdir){
  # Write the combined SILVA + focusDB sequences for one assembly accession to
  # <destdir>/<assembly>.msa in fasta format.
  # NOTE(review): relies on globals `silva` and `all_focus` (XStringSets keyed
  # by sequence name) defined elsewhere in this document — confirm before reuse.
  #
  # get matching assembly from sraFind to get the complete genome/sequence accession
  this_silva <- unique(db[grepl(assembly, db$Assembly.Accession), "raw" ])
  if (length(this_silva) == 0) {
    warning(paste("No matches ", assembly, ";  could be long reads?"))
    return (list(NA, NA, NA, NA, NA, NA))
  }
  # paste together to make a query string with or's; this is because for assemblies, 
  # # different contigs will have different sequence names, but a single assembly accession
  this_silva_query <- gsub("(.*)\\|$", "\\1", paste0(unique(gsub("(.*?)\\..*", "\\1", this_silva)), collapse = "", sep="|"))
  # get matching SRAs for assembly, so we can use them to search the focusdb results
  this_focus <- unique(unlist(strsplit(db[grepl(assembly,  db$Assembly.Accession), "run_SRAs" ],",")))
  # build focusdb query string;  get rid of the *_\\d suffix we add to SRAs to keep them unique
  this_focus_query <- gsub("(.*)\\|$", "\\1", paste0(gsub("_\\d+", "",this_focus), collapse = "", sep="|"))
  
  # identify all sequences associated with this assembly accession
  silva_subset_seqs <- silva[grepl(this_silva_query,  names(silva))]
  # identify the sequences from focusDB
  focus_subset_seqs <- all_focus[grepl(this_focus_query, names(all_focus))]
  unique_focus_subset_seqs <- unique(focus_subset_seqs)
  if (length(unique_focus_subset_seqs) == 0) {
    warning(paste("No matches for ",this_focus_query,"(", assembly, ")  in focusDB results"))
    return (list(NA, NA, NA, NA, NA, NA))
  } else{
    # SILVA is RNA; type-cast to DNA so both sets share an alphabet
    Biostrings::writeXStringSet(
      x=append(DNAStringSet(silva_subset_seqs), unique_focus_subset_seqs),
      filepath = file.path(destdir, paste0(assembly, ".msa")),
      append = F, 
      format = "fasta")
  }
}
build_pairwise_network <- function(db, assembly="GCA_000699725.1"){
  # For one assembly accession, align every focusDB sequence (from the fast,
  # full, and denovo sets) against the SILVA sequences for that assembly and
  # keep the single best-scoring SILVA hit per focusDB sequence, expressed as
  # SNPs per kbp ("pkb").
  #
  # db:       data frame linking SILVA headers ("raw"), Assembly.Accession,
  #           and the comma-separated run_SRAs column from sraFind
  # assembly: the assembly accession to process
  #
  # Returns a data frame with columns set, focusDB, silva, pkb, assembly
  # (a single all-NA row when nothing matched).

  # get matching assembly from sraFind to get the complete genome/sequence accession
  this_silva <- unique(db[grepl(assembly, db$Assembly.Accession), "raw" ])
  if (length(this_silva) == 0) {
    warning(paste("No matches ", assembly, ";  could be long reads?"))
    return (list(NA, NA, NA, NA, NA, NA))
  }
  # paste together to make a query string with or's; this is because for assemblies,
  # different contigs will have different sequence names, but a single assembly accession
  this_silva_query <- gsub("(.*)\\|$", "\\1", paste0(unique(gsub("(.*?)\\..*", "\\1", this_silva)), collapse = "", sep="|"))
  # get matching SRAs for assembly, so we can use them to search the focusdb results
  this_focus <- unique(unlist(strsplit(db[grepl(assembly,  db$Assembly.Accession), "run_SRAs" ],",")))
  # build focusdb query string; get rid of the *_\\d suffix we add to SRAs to keep them unique
  this_focus_query <- gsub("(.*)\\|$", "\\1", paste0(gsub("_\\d+", "",this_focus), collapse = "", sep="|"))

  # identify all sequences associated with this assembly accession
  silva_subset_seqs <- datasets[["silva"]][["set"]][grepl(this_silva_query,  names(datasets[["silva"]][["set"]]))]
  # identify the sequences from each focusDB set, align them, keep the best hits
  these_results <- data.frame("set"=character(), "focusDB"=character(), "silva"=character(), "pkb"=numeric(), stringsAsFactors = FALSE)
  for (thisset in c("fast", "full", "denovo")){
    focus_subset_seqs <-  datasets[[thisset]][["set"]][grepl(this_focus_query, names( datasets[[thisset]][["set"]]))]
    if (length(focus_subset_seqs) == 0) next
    for (i in seq_along(focus_subset_seqs)){
      overall_best_score <- 0
      # FIX: was `exists("this_row")`, which leaked the previous sequence's
      # row into iterations that found no hit of their own
      this_row <- NULL
      # FIX: was `1:length(this_silva)`, whose length can differ from the
      # actually-subset silva sequences being indexed
      for (j in seq_along(silva_subset_seqs)){
        # skip partial SILVA sequences (checked before aligning to save work)
        if (length(silva_subset_seqs[[j]]) < 1400) next
        best_score <- pairwiseAlignment(
          DNAStringSet(focus_subset_seqs[i]), 
          DNAStringSet(silva_subset_seqs[j]), 
          scoreOnly=TRUE, 
          substitutionMatrix=nucleotideSubstitutionMatrix(match = 1, mismatch = 0, baseOnly = FALSE, type = "DNA"),
          gapOpening = 0, gapExtension = 0,
          type="overlap")
        maxlen <- min(length(focus_subset_seqs[[i]]), length(silva_subset_seqs[[j]]))
        # with match=1/mismatch=0 the score counts matching bases: e.g. a max
        # of 1500 with score 1490 is 10 SNPs, or 6.6 SNPs per kbp
        if (best_score > overall_best_score){
          # FIX: overall_best_score was never updated, so the *last* hit
          # scoring above zero (not the best one) used to be kept
          overall_best_score <- best_score
          this_row <- data.frame(
            "set"=thisset,
            "focusDB"=names(focus_subset_seqs[i]),
            "silva"=names(silva_subset_seqs[j]),
            "pkb"=((maxlen - best_score ) * 1000) / maxlen, 
            stringsAsFactors = FALSE)
        }
      }
      # we only add the top hit for each focusDB sequence; this_row stays NULL
      # when no silva sequence passed the length filter or scored above zero
      if (!is.null(this_row)){
        these_results <- rbind(
          these_results,
          this_row)
      }
    }
  }
  if (nrow(these_results) == 0) {
    warning(paste("No focusDB matches", assembly))
    these_results <- data.frame(
      "set"=NA,
      "focusDB"=NA,
      "silva"=NA,
      "pkb"=NA,  
      stringsAsFactors = FALSE
      )
  }

  these_results$assembly <- assembly
  return(these_results)
}

Next, we subsetted the dataframe to include only those that had SRAs from this study (and by proxy, those that were sequenced with Illumina).

only_illumina_assembly_summary <- assembly_summary # [assembly_summary$platform == "ILLUMINA", ]
# flag rows whose run_SRAs contain any SRA processed in this study
only_illumina_assembly_summary$processed_in_this_study <- FALSE
# strip the _\d suffix from the focusDB ids to recover the raw SRA accessions
these_sras <- unique(gsub("(.*)_(.*)", "\\1", datasets[['fast']][["metadata"]]$id))
for (sra in these_sras){
  only_illumina_assembly_summary$processed_in_this_study <- grepl(
    sra, only_illumina_assembly_summary$run_SRAs,  fixed = T)  |  only_illumina_assembly_summary$processed_in_this_study
}
table(only_illumina_assembly_summary$processed_in_this_study )
## 
## FALSE  TRUE 
## 78351  1169
# keep only processed rows with a complete genome available for comparison
only_illumina_assembly_summary <- only_illumina_assembly_summary%>% filter(processed_in_this_study) %>%
  filter(cg)

Alright, so now we know which SRAs relate to which assemblies, and which we have analyzed. The first order of business is to compare the accuracy of fast mode (using riboSeed’s --just_seed parameter) to 16S’s from full assemblies. However, for a full analysis, we also need to compare to the de novo assembly.

# Build a SILVA-style database from each SRA's de novo assembly contigs
for i in results/2019-11-11-results/*_genus/*/
do 
echo $i
thissra=$(basename $i)
contigs="${i}/results/riboSeed/seed/final_de_novo_assembly/contigs.fasta"
python ../py16db/make_silva_style_db_from_contigs.py --name $thissra $contigs  ./results/de_novo_seqs.fasta
done
# dest_dir <- file.path("docs", "ref_msas")
# dir.create( dest_dir)
# accumulator for the per-assembly pairwise alignment results
snp_results <- data.frame(
  "set" = character(),
  "focusDB" = character(),
  "silva" = character(),
  "assembly" = character(),
  "pkb" = numeric(),
  stringsAsFactors = FALSE
)
nassemb <- length(unique(only_illumina_assembly_summary$Assembly.Accession))
counter <- 0
# iterating row by row would reprocess assemblies that appear on several rows,
# so skip any assembly already present in the accumulated results.
# FIX: seq_len() instead of 1:nrow(), which iterates c(1, 0) on an empty frame
for (i in seq_len(nrow(only_illumina_assembly_summary))){
#for (i in 1:50){
  assembly <- only_illumina_assembly_summary$Assembly.Accession[i]
  if (assembly %in% snp_results$assembly){
    next
  } else{
    print(paste(counter, "of", nassemb))
    snp_results <- rbind(
      snp_results,
      build_pairwise_network(db = only_illumina_assembly_summary,
                             assembly = assembly)
    )
    counter <- counter + 1
  }
}
## [1] "0 of 69"
## [1] "1 of 69"
## [1] "2 of 69"
## [1] "3 of 69"
## [1] "4 of 69"
## [1] "5 of 69"
## [1] "6 of 69"
## [1] "7 of 69"
## [1] "8 of 69"
## [1] "9 of 69"
## [1] "10 of 69"
## [1] "11 of 69"
## [1] "12 of 69"
## [1] "13 of 69"
## [1] "14 of 69"
## [1] "15 of 69"
## [1] "16 of 69"
## [1] "17 of 69"
## [1] "18 of 69"
## [1] "19 of 69"
## [1] "20 of 69"
## [1] "21 of 69"
## [1] "22 of 69"
## [1] "23 of 69"
## [1] "24 of 69"
## [1] "25 of 69"
## [1] "26 of 69"
## [1] "27 of 69"
## [1] "28 of 69"
## [1] "29 of 69"
## [1] "30 of 69"
## [1] "31 of 69"
## [1] "32 of 69"
## [1] "33 of 69"
## [1] "34 of 69"
## [1] "35 of 69"
## [1] "36 of 69"
## [1] "37 of 69"
## [1] "38 of 69"
## [1] "39 of 69"
## [1] "40 of 69"
## [1] "41 of 69"
## [1] "42 of 69"
## [1] "43 of 69"
## [1] "44 of 69"
## [1] "45 of 69"
## [1] "46 of 69"
## [1] "47 of 69"
## [1] "48 of 69"
## [1] "49 of 69"
## [1] "50 of 69"
## [1] "51 of 69"
## [1] "52 of 69"
## [1] "53 of 69"
## [1] "54 of 69"
## [1] "55 of 69"
## [1] "56 of 69"
## [1] "57 of 69"
## [1] "58 of 69"
## [1] "59 of 69"
## [1] "60 of 69"
## [1] "61 of 69"
## [1] "62 of 69"
## [1] "63 of 69"
## [1] "64 of 69"
## [1] "65 of 69"
## [1] "66 of 69"
## [1] "67 of 69"
## [1] "68 of 69"
# remove NAs
# Accuracy (SNPs/kbp vs the complete genome's 16S) across the three focusDB
# modes; grey bars show relative sequence counts (divided by 10 so they share
# the y axis with the SNP values), NA rows removed first
(p_snps <- ggplot(snp_results %>% filter(!is.na(pkb)), aes(x=set )) + 
    mytheme +
    geom_bar(aes(y=..count../10), fill="grey80")+ 
    geom_label(stat='count', aes(y=(..count../10) +20, label=paste(..count.., "\nsequences")))+ 
    geom_boxplot(aes(y=pkb, color=set), color="grey30", outlier.shape = NA, outlier.colour = "grey") + 
    geom_jitter(aes(y=pkb, color=set), alpha=.3, width = .2, height = .02) +  
    labs(title="Subassembled rDNAs are more accurate than rDNAs from final assemblies", 
         subtitle="gray bars show the relative number of sequences recovered",
         x= "focusDB  Mode",
         color="Assembly mode",
         y="SNPs per kbp \n(and relative ammount of seqeunces recovered)")
  )

# source/target: first whitespace-delimited token of each sequence name,
# used later as graph node ids
snp_results$source <- gsub("(.*?)\\s.*", "\\1", snp_results$focusDB)
snp_results$target <- gsub("(.*?)\\s.*", "\\1", snp_results$silva)
# counts for the figure subtitle: fast-mode hits, and those that were exact
seqs_aligned_against_cg <- snp_results %>% filter(set=="fast") %>% nrow()
n_exact_seqs <-  snp_results %>% filter(set=="fast") %>% filter(pkb==0) %>% nrow()
# histogram of SNPs/kbp for fast mode against complete genomes
(p_snps_hist <- ggplot(snp_results %>% filter(!is.na(pkb)) %>%filter(set=="fast"), aes(x=pkb)) + 
    mytheme +
    geom_histogram() +
    labs(
      x="SNPs/Kbp (per genome)", y="Number of Assemblies",
      title="Comparing riboSeed-assembled 16S to Complete Genome's 16S",
      subtitle=paste0(
      "In the genera considered, ", 
      nrow(only_illumina_assembly_summary), 
      " sequences from ", nassemb, " complete genomes were present in SILVA.\n",
      "riboSeed recovered ",seqs_aligned_against_cg, " sequences, of which ",  n_exact_seqs, " were exact.")
    )
)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggsave("docs/figures/S5-set-accuracy.png", p_snps, device = "png", dpi = 300, width = 6, height = 5)
ggsave("docs/figures/4-fast-cg-accuracy.png", p_snps_hist, device = "png", dpi = 300, width = 8, height = 6)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
write.table(snp_results, file="./docs/snps_per_kbp.tab", sep="\t", row.names=F)

Now, we build an interactive graph.

# install.packages("visNetwork")
library(visNetwork)
# restrict to (at most) the first 1000 assemblies' results
graph_res <- snp_results %>% filter(assembly %in% c(snp_results$assembly[1:1000]))
# long format: one row per node mention (source or target), carrying pkb
tres <- graph_res %>% select(-focusDB, -silva) %>%
  gather(key="x", value="node", -pkb, -assembly, -set) 
# node table: references (silva targets) drawn as pentagons, assemblies shaped
# by which focusDB set they came from
nodes <- data.frame(
  id = tres$node, 
  #roup=interaction(tres$assembly, tres$set),
  group=ifelse(tres$node %in% graph_res$target, "Ref", "Assembly"),
  label=gsub("(.*?) (.*)", "\\1", tres$node),
  shape= ifelse(
    tres$node %in% graph_res$target, 
    "pentagon", 
    case_when(
      tres$set == "full" ~ "square",
      tres$set == "fast" ~ "triangle",
      tres$set == "denovo" ~ "diamond")),
  size=10
  )
#nodes <- nodes %>% distinct() %>% as.data.frame()
# drop duplicate node ids, keeping the first occurrence
nodes <- nodes[!duplicated(nodes$id),]
# edges between focusDB sequence and its best silva hit; edge length scales
# with pkb^2 so more-divergent pairs sit further apart
edges <- data.frame(from = graph_res$source, to = graph_res$target, length=graph_res$pkb^2, weights=graph_res$pkb, label=graph_res$pkb)
# interactive plot: edge labels appear on hover, hidden on blur
visNetwork(nodes, edges, width = "100%", ) %>% 
  visLegend() %>%  
  visOptions(highlightNearest = TRUE, nodesIdSelection = TRUE) %>%   
  visInteraction(hover = T) %>%
  visEvents(hoverEdge = "function(e){
    this.body.data.edges.update({id: e.edge, font: {size : 14}});
  }") %>%
  visEvents(blurEdge = "function(e){
    this.body.data.edges.update({id: e.edge, font: {size : 0}});
  }")
# Alternative interactive graph via networkD3 (exploratory; see notes below).
# install.packages("networkD3")
library(networkD3)
data(MisLinks)    
data(MisNodes)
# NOTE(review): `results` is not defined in this section; the analogous
# visNetwork block above subsets snp_results$assembly -- confirm this should
# be snp_results$assembly[1:105].
graph_res <- snp_results %>% filter(assembly %in% c(results$assembly[1:105]))
tres <- graph_res %>% select(-focusDB, -silva) %>%
  gather(key="x", value="node", -pkb, -assembly, -set) 
nodes <- data.frame(
  NodeID = tres$node, 
  Group=tres$assembly,
  label=gsub("(.*?) (.*)", "\\1", tres$node)
)
# Deduplicate nodes, keeping the first occurrence of each NodeID.
nodes <- nodes[!duplicated(nodes$NodeID),]
edges <- data.frame(Source = graph_res$source, target = graph_res$target, Value=graph_res$pkb)
# MisLinks$source <- as.character(MisLinks$source)
# forceNetwork(Links = MisLinks, Nodes = MisNodes,
#             Source = "source", Target = "target",
#             Value = "value", NodeID = "name",
#             Group = "group", opacity = 0.8)
# NOTE(review): forceNetwork() requires Source/Target (0-based integer row
# indices into Nodes, not name strings); with those arguments commented out
# this call likely fails -- confirm whether this block is abandoned.
forceNetwork(Links = edges, Nodes = nodes,
            #Source = "source", target = "target",
            #Value = "Value",
            NodeID = "NodeID",
            Group = "Group", 
            #opacity = 0.8
            )
# Going row by row reprocesses assemblies, so we skip any assembly already
# processed. Because we subset from the same dataset inside the function,
# this keeps everything in order (as opposed to keeping a separate list of
# unique assemblies and subsetting both in the function and in the results
# assignment).
# Results structure: a tidy data frame plus lists of alignments / min SNPs.
#
# Fixed: start from a true 0-row data frame. The original seeded a 1-row
# all-NA frame and emptied it with `reassembly_results[!is.na(...), ]`,
# which indexes rows with a logical *matrix* and only works by accident.
# Also dropped the never-populated min_snps column: the rbind() below never
# supplies it, and base rbind() errors on mismatched column names; the
# per-assembly minimum SNP counts live in the `min_snps` list instead.
reassembly_results <- data.frame(
  assembly=NA, unique_focusdb=NA, total=NA, focusdb=NA, silva=NA,
  category=NA, stringsAsFactors = FALSE)
reassembly_results <- reassembly_results[0, ]
alignments <- list()   # per-assembly alignments (res[[1]])
min_snps <- list()     # per-assembly minimum SNP counts (res[[6]])
counter <- 0           # number of assemblies processed so far
for (i in seq_len(nrow(only_illumina_assembly_summary))){
  assembly <- only_illumina_assembly_summary$Assembly.Accession[i]
  if (assembly %in% reassembly_results$assembly){
    next
  } else{
    print(paste(counter, "of", nassemb ))
    # build_aln_for_assembly() appears to return a list:
    # [[1]] alignment, [[2]] n unique focusDB seqs, [[3]] total seqs,
    # [[4]] n focusDB seqs, [[5]] n SILVA seqs, [[6]] min SNP counts
    # (inferred from the usage below -- confirm against the function).
    res <- build_aln_for_assembly(
      db=only_illumina_assembly_summary, 
      assembly=assembly, trimf=10, trimr=10
    )
    alignments[[assembly]] <-  res[[1]]
    min_snps[[assembly]] <- res[[6]]
    reassembly_results <- rbind(
      reassembly_results, 
      data.frame(assembly=assembly,unique_focusdb=res[[2]], total=res[[3]], 
                 category=only_illumina_assembly_summary$category[i], 
                 focusdb=res[[4]], silva=res[[5]], stringsAsFactors = FALSE)
    )
    counter <- counter + 1
  }
}
# Cross-tabulate assembly status against whether focusDB introduced any new
# sequences, then show the distribution of new-sequence counts by category.
with(reassembly_results,
     table("status" = category, "New Seqs Introduced" = unique_focusdb != 0))
ggplot(reassembly_results, aes(x = unique_focusdb, fill = category)) +
  geom_density(alpha = 0.5)

# Total SNPs per assembly: each min_snps element is a (possibly nested) list
# of per-allele minimum SNP counts; flatten and sum. vapply() (rather than
# sapply()) guarantees a named numeric vector regardless of input length.
snps <- vapply(min_snps, function(x) sum(unlist(x)), numeric(1))
snpdf <- data.frame(snps=snps, assembly=names(snps))
# Attach SNP totals to the per-assembly results and drop assemblies that
# produced no alignment (total is NA).
reassembly_results_df <- merge(reassembly_results, snpdf, by="assembly")
reassembly_results_df <- reassembly_results_df[!is.na(reassembly_results_df$total),]

# Sequence totals for the complete-genome category: SILVA vs focusDB counts.
total_cg_seqs  <-sum(
  reassembly_results_df[reassembly_results_df$category == "Complete Genome", "silva"])
total_focus_seqs  <-sum(
  reassembly_results_df[reassembly_results_df$category == "Complete Genome", "focusdb"])
# Quick look at the raw SNP-count distribution for complete genomes.
ggplot(reassembly_results_df %>% filter(category == "Complete Genome"), aes(x=snps)) + geom_histogram()

# SNPs/kbp per genome for complete genomes, assuming ~1500 bp per 16S allele.
# NOTE(review): unlike the earlier dynamic subtitle, the counts here
# ("182 of 253", "14 genomes") are hard-coded -- confirm they still match.
(snps_cg_kbp <- ggplot(
  reassembly_results_df %>% filter(category == "Complete Genome") %>% 
    #filter(snps < 500) %>% 
    mutate(pkbp=(snps * 1000)/(focusdb * 1500)), 
  aes(x=pkbp)) + geom_histogram() + labs(
    x="SNPs/Kbp (per genome)", y="Number of Assemblies",
    title="Comparing riboSeed-assembled 16S to Complete Genome's 16S",
    subtitle=paste("In the genera considered, sequences for complete genomes were present in SILVA.\nriboSeed recovered 182 of 253 sequences.\nIn 14 genomes,16S alleles were identical between riboSeed and complete genome")
  )
)
# Mean raw SNPs per focusDB sequence, excluding extreme outliers (>= 500).
(snps_cg_raw <- ggplot(
  reassembly_results_df %>% filter(category == "Complete Genome") %>% 
    filter(snps < 500) %>% mutate(per=(snps )/(focusdb)), 
  aes(x=per)) + geom_histogram() + labs(
    x="mean SNPs", y="Number of Assemblies",
    title="Comparing riboSeed-assembled 16S to Complete Genome's 16S"
  )
)



# Only the per-kbp version is saved as a figure.
ggsave("docs/figures/X-snps.png", snps_cg_kbp, device = "png", 
       dpi = 300, width = 7, height = 5)

# Interactive inspection of assemblies where focusDB introduced new alleles.
# NOTE(review): `assembly > 0` compares accession strings against 0; this
# relies on string coercion and is almost certainly not the intended filter.
reassembly_results[reassembly_results$assembly > 0, ]
# GCA_000767745.1 GCA_000832805.1
# For each complete genome with novel focusDB alleles, open the deduplicated,
# end-trimmed alignment in a browser, then draw a pairwise SNP heatmap.
for (not_great in reassembly_results[reassembly_results$category=="Complete Genome" & reassembly_results$unique_focusdb != 0, "assembly"]){
  BrowseSeqs(subseq(alignments[[not_great]][!duplicated(alignments[[not_great]])], 10, -20), highlight=0)
  # Pause so each alignment can be examined before the next one opens.
  invisible(readline(prompt="Press [enter] to continue"))
  #distmat <- DECIPHER::DistanceMatrix(subseq(alignments$GCA_000832805.1, 10, -20)) #%>% as.data.frame() %>%
  #rownames_to_column("strainA") %>% gather(key="strainB", value="dist", -strainA)
  # Hamming distances between trimmed alleles = pairwise SNP counts.
  snp_mat <- stringDist(subseq(alignments[[not_great]], 10, -20), method="hamming")
  gplots::heatmap.2(as.matrix(snp_mat))
  #heatmap(as.matrix(snp_mat ))
}
# Join in sraFind metadata for manual review (interactive session only).
View(merge(reassembly_results, srafind, by.x="assembly", by.y="Assembly.Accession"))

Discussion

# Count sraFind entries with reads but no complete chromosome in nuccore,
# split by whether an SRA study accession is present.
sraFind_data <- read.table("~/.focusDB/sraFind.tab", header = TRUE,
                           sep = "\t", stringsAsFactors = FALSE)
tbassembled <- sraFind_data %>%
  filter(nuccore_first_chrom == "", !is.na(study_acc)) %>%
  nrow()
tbreleased <- sraFind_data %>%
  filter(nuccore_first_chrom == "", is.na(study_acc)) %>%
  nrow()
# Total focusDB sequences = unique + duplicated for the "focus" database.
focus_rows <- uniqueness$DB == "focus"
sum(uniqueness$Unique[focus_rows]) + sum(uniqueness$Duplicated[focus_rows])
## [1] 5769

Execution time

library(lubridate)
## 
## Attaching package: 'lubridate'
## The following object is masked from 'package:IRanges':
## 
##     %within%
## The following objects are masked from 'package:S4Vectors':
## 
##     second, second<-
## The following object is masked from 'package:base':
## 
##     date
# Parse the focusDB run log into (timestamp, message) pairs for timing.
# (A no-op `alog <- alog[1:length(alog)]` was removed here; the 1:length()
# pattern is also unsafe on zero-length vectors.)
alog <- readLines("./docs/timing_results/HMP-combined_log.txt")
# Log lines look like "<datetime>,<ms> - <level> - <message>"; the first
# capture keeps everything before the comma (the parseable datetime).
lo <- data.frame(dt=gsub("(.*?),.* - .*? - (.*)","\\1", alog), stringsAsFactors = FALSE)
lo$message  <-  gsub("(.*?) - .*? - (.*)","\\2", alog)
lo$datetime  <- ymd_hms(lo$dt)
## Warning: 257 failed to parse.
# caused by broken lines in log; drop rows whose timestamp did not parse
lo <- lo[!is.na(lo$datetime), ]
# Keep only the log messages that delimit pipeline stages.
lo <- lo[grepl("Processing|Found SRAs|Downloading genomes|checking reference|Downloading|Obtaining average|Finding best|Quality trimming reads|Downsampling reads|Running riboSeed|Sum of|Skipping riboSeed", lo$message), ]
# Organism name from "Processing <name>" lines (name starts with a non-digit).
lo$bug <- ifelse(grepl("Processing \\D", lo$message),  gsub("Processing ","", lo$message), NA)
# SRA accession from "Downloading <SRR...>" lines; "Processing" lines mark
# organism-level ("GLOBAL") stages.
lo$sra <- ifelse(grepl("Downloading .*\\d+", lo$message),  gsub("Downloading ","", lo$message), ifelse(
    grepl("Processing", lo$message), "GLOBAL", NA))
# Carry sra/bug down to subsequent rows, then drop any (sra, bug) group whose
# riboSeed run was skipped.
lo <- lo %>% fill(sra) %>% fill(bug) %>% group_by(sra, bug) %>% mutate(keep = !any(grepl("Skipping riboSeed", message))) %>% filter(keep) %>% select(-keep)
# fix download message by removing the reference to the specific SRA
lo$message <- gsub("Downloading .*\\d+","Downloading SRA", gsub("(\\D+?)\\s(\\D+?)[$ ].*","\\1 \\2", lo$message))
# fix riboSeed message by removing the organism name
lo$message <- gsub("Processing \\D+", "Processing org", lo$message)
# strip remaining digits so identical stages share one message string
lo$message <- gsub("\\d*", "", lo$message)
# get rid of organisms with missing data: a trimming/coverage message tagged
# GLOBAL indicates a malformed log section (original note: "get rid of missing")
lof <- lo %>%  group_by(bug) %>% mutate(remove = any(grepl("Quality trimming|Obtaining", message )& sra == "GLOBAL")) %>% filter(!remove) %>% select(-remove) %>% filter(bug != "Actinomyces odontolyticus")
# One row per (bug, sra); columns = stage messages, values = timestamps.
lot <- lof %>% select(-dt) %>% filter(sra != "SRR1819776") %>% spread(key=message, value=datetime)
# Assign each per-SRA row an end timestamp (org_done): the start of the next
# SRA's download, or -- when the next row is a GLOBAL stage -- the start of
# the riboSeed batch. Initialized from the first download timestamp so the
# column has the right datetime type.
lot$org_done <- lot$`Downloading SRA`[1]
# Iterate over all but the last row; seq_len(max(..., 0)) replaces the
# original `1:nrow(lot)` + `if (i != nrow(lot))` guard and stays safe even
# when lot has fewer than two rows.
for (i in seq_len(max(nrow(lot) - 1, 0))){
  if (lot$sra[i] != "GLOBAL"){
    if (lot$sra[i+1]  !=  "GLOBAL"){
      # if next item is another SRR
      lot$org_done[i] <- lot$`Downloading SRA`[i+1]
    }else {
      # If next item is a global message
      lot$org_done[i] <- lot$`Processing  riboSeed runs`[i+1]
    } 
  }
}
# per-SRA steps: elapsed time between consecutive stage timestamps
# (units="m" partially matches difftime's "mins", so values are in minutes)
lot$Download_SRA <- difftime(lot$`Quality trimming` ,  lot$`Downloading SRA`, units="m")
lot$Trim <- difftime(lot$`Downsampling reads`, lot$`Quality trimming`, units="m")
#lot$Downsample<- lot$org_done  - lot$`Downsampling reads`
# global (per-organism) steps
lot$Assemble <- difftime(lot$`Sum of`, lot$`Processing  riboSeed runs`, units = "m")
lot$Download_refs <- difftime(lot$`checking reference`, lot$`Downloading genomes`, units="m")
lot$Find_SRAs <- difftime(lot$`Found SRAs:`, lot$`Processing org`, units="m")

# Long format for plotting: one row per (organism, SRA, step) duration.
# NOTE(review): the value column is named "s" but holds minutes (see
# units="m" above); the plot below correctly labels the axis "Minutes".
tlot  <- lot %>% select(bug, sra, Download_SRA, Trim, Assemble, Download_refs, Find_SRAs) %>%
  gather(key="step", value="s", -bug, -sra) %>%
  filter(!is.na(s)) %>%
  mutate(stage = ifelse(grepl("ad_SRA|Trim|Downsample", step), "SRA", "Global"))


# Boxplot plus jittered per-run points of minutes per step, faceted by
# whether the step is per-SRA or global; outer parentheses print the plot.
(p_timing <- ggplot(tlot, aes(x=step, y=s, color= bug)) + 
    geom_boxplot(color="black", fill=NA) +
    geom_jitter(width = .2, alpha=.4) + 
    scale_y_continuous() +
    scale_color_discrete(guide="none") +
    coord_flip() + 
    labs(title="Time per stage", x="Step", y="Minutes") +
    facet_wrap(~stage, scales="free"))  

# Save the supplementary timing figure.
ggsave("docs/figures/S3-time.png", p_timing, device = "png", 
       dpi = 300, width = 11, height = 7)

Very rough timing estimates for fast mode:

# Mean duration of each step; values are difftime objects in minutes (see
# units="m" above), so the column is named mean_min -- the original name
# `mean_sec` was a misnomer.
time_means <- tlot %>% group_by(step) %>% summarise(mean_min=mean(s))
# Extrapolate one full pass over 98329 runs -- presumably the number of
# candidate SRAs in fast mode; confirm where 98329 comes from.
98329 * sum(time_means$mean_min)
## Time difference of 2237832 mins
# The same total, converted to days.
as.numeric(98329 * sum(time_means$mean_min), units="days") 
## [1] 1554.05